# Load the bank data set. read.csv2() reads semicolon-separated files.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0.0,
# where the default became FALSE; the factor-based str() and summary()
# output of this analysis depends on it.
bank <- read.csv2(
  "~/STUDIES/TBS Courses/DM course/raw data/bank.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows of the loaded data frame
head(bank)
## age job marital education default balance housing loan contact
## 1 30 unemployed married primary no 1787 no no cellular
## 2 33 services married secondary no 4789 yes yes cellular
## 3 35 management single tertiary no 1350 yes no cellular
## 4 30 management married tertiary no 1476 yes yes unknown
## 5 59 blue-collar married secondary no 0 yes no unknown
## 6 35 management single tertiary no 747 no no cellular
## day month duration campaign pdays previous poutcome y
## 1 19 oct 79 1 -1 0 unknown no
## 2 11 may 220 1 339 4 failure no
## 3 16 apr 185 1 330 1 failure no
## 4 3 jun 199 4 -1 0 unknown no
## 5 5 may 226 1 -1 0 unknown no
## 6 23 feb 141 2 176 3 failure no
# Compact overview of the data frame: type and first values of each column
str(object = bank)
## 'data.frame': 4521 obs. of 17 variables:
## $ age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
## $ education: Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
## $ contact : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : Factor w/ 12 levels "apr","aug","dec",..: 11 9 1 7 9 4 9 9 9 1 ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Split the columns of bank into three data frames by variable type.
# Numerical variables
banknum <- bank[c(
  "age", "balance", "day", "duration", "campaign", "pdays", "previous"
)]
# Multinomial (more than two levels) variables
bankcat <- bank[c(
  "job", "marital", "education", "contact", "month", "poutcome"
)]
# Binary (yes/no) variables
bankbin <- bank[c("default", "housing", "loan", "y")]
We will study each set of variables one by one, starting with the banknum set. We will use the Pearson correlation to determine the relations between them; for descriptive statistics we use the function describe from the psych package, as it gives us better insight into the normality of the variables and their anomalies.
#Upload the necessary packages
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(Hmisc))
suppressPackageStartupMessages(library(psych))
suppressPackageStartupMessages(library(corrplot))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(purrr))
We will also use the function multi.hist to produce multiple histograms with density plots, and we will conduct individual normality tests using the shapiro.test function.
# Put the columns of banknum on the search path under their bare names.
# NOTE(review): attach() is generally discouraged; check what relies on the
# bare column names before removing it.
attach(what = banknum)
# Descriptive statistics; namespace-qualified because Hmisc, which is also
# loaded, exports a describe() of its own.
psych::describe(x = banknum)
## vars n mean sd median trimmed mad min max range
## age 1 4521 41.17 10.58 39 40.48 10.38 19 87 68
## balance 2 4521 1422.66 3009.64 444 802.41 658.27 -3313 71188 74501
## day 3 4521 15.92 8.25 16 15.80 10.38 1 31 30
## duration 4 4521 263.96 259.86 185 216.44 143.81 4 3025 3021
## campaign 5 4521 2.79 3.11 2 2.14 1.48 1 50 49
## pdays 6 4521 39.77 100.12 -1 11.56 0.00 -1 871 872
## previous 7 4521 0.54 1.69 0 0.12 0.00 0 25 25
## skew kurtosis se
## age 0.70 0.35 0.16
## balance 6.59 88.25 44.76
## day 0.09 -1.04 0.12
## duration 2.77 12.51 3.86
## campaign 4.74 37.11 0.05
## pdays 2.72 7.94 1.49
## previous 5.87 51.91 0.03
# Pairwise Pearson correlations of the numerical variables with p-values
Hmisc::rcorr(x = as.matrix(banknum), type = "pearson")
## age balance day duration campaign pdays previous
## age 1.00 0.08 -0.02 0.00 -0.01 -0.01 0.00
## balance 0.08 1.00 -0.01 -0.02 -0.01 0.01 0.03
## day -0.02 -0.01 1.00 -0.02 0.16 -0.09 -0.06
## duration 0.00 -0.02 -0.02 1.00 -0.07 0.01 0.02
## campaign -0.01 -0.01 0.16 -0.07 1.00 -0.09 -0.07
## pdays -0.01 0.01 -0.09 0.01 -0.09 1.00 0.58
## previous 0.00 0.03 -0.06 0.02 -0.07 0.58 1.00
##
## n= 4521
##
##
## P
## age balance day duration campaign pdays previous
## age 0.0000 0.2301 0.8736 0.7293 0.5500 0.8134
## balance 0.0000 0.5597 0.2836 0.5025 0.5259 0.0782
## day 0.2301 0.5597 0.0978 0.0000 0.0000 0.0000
## duration 0.8736 0.2836 0.0978 0.0000 0.4853 0.2242
## campaign 0.7293 0.5025 0.0000 0.0000 0.0000 0.0000
## pdays 0.5500 0.5259 0.0000 0.4853 0.0000 0.0000
## previous 0.8134 0.0782 0.0000 0.2242 0.0000 0.0000
# The correlations are not plotted: none was judged significant.
# TODO: preferably visualize with ggplot; rectification required.
# One histogram per numerical variable, orange bars with blue density curves.
psych::multi.hist(
  x = banknum,
  dcol = "blue",
  bcol = "orange"
)
# Shapiro-Wilk normality test for each numerical variable.
# Each call is evaluated inside banknum via with(), so the column is taken
# from that data frame explicitly rather than from whatever happens to be
# attached on the search path; the reported data name is unchanged.
with(banknum, shapiro.test(age))
##
## Shapiro-Wilk normality test
##
## data: age
## W = 0.95951, p-value < 2.2e-16
with(banknum, shapiro.test(balance))
##
## Shapiro-Wilk normality test
##
## data: balance
## W = 0.50151, p-value < 2.2e-16
with(banknum, shapiro.test(day))
##
## Shapiro-Wilk normality test
##
## data: day
## W = 0.96072, p-value < 2.2e-16
with(banknum, shapiro.test(duration))
##
## Shapiro-Wilk normality test
##
## data: duration
## W = 0.74754, p-value < 2.2e-16
with(banknum, shapiro.test(campaign))
##
## Shapiro-Wilk normality test
##
## data: campaign
## W = 0.56082, p-value < 2.2e-16
with(banknum, shapiro.test(pdays))
##
## Shapiro-Wilk normality test
##
## data: pdays
## W = 0.47041, p-value < 2.2e-16
with(banknum, shapiro.test(previous))
##
## Shapiro-Wilk normality test
##
## data: previous
## W = 0.35998, p-value < 2.2e-16
Now we proceed to analyze the categorical variables in our data set. We will visualise them with barplots and then conduct association tests.
# Per-column summary of the multinomial variables
summary(object = bankcat)
## job marital education contact
## management :969 divorced: 528 primary : 678 cellular :2896
## blue-collar:946 married :2797 secondary:2306 telephone: 301
## technician :768 single :1196 tertiary :1350 unknown :1324
## admin. :478 unknown : 187
## services :417
## retired :230
## (Other) :713
## month poutcome
## may :1398 failure: 490
## jul : 706 other : 197
## aug : 633 success: 129
## jun : 531 unknown:3705
## nov : 389
## apr : 293
## (Other): 571
# Bar plot of the level counts for each multinomial variable.
# Layers shared by all six plots: orange bars with a black outline and
# x-axis labels tilted by 45 degrees.
cat_bar_layers <- list(
  geom_bar(fill = "orange", color = "black"),
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
p1 <- ggplot(data = bankcat, mapping = aes(x = job)) + cat_bar_layers
p2 <- ggplot(data = bankcat, mapping = aes(x = marital)) + cat_bar_layers
p3 <- ggplot(data = bankcat, mapping = aes(x = education)) + cat_bar_layers
p4 <- ggplot(data = bankcat, mapping = aes(x = contact)) + cat_bar_layers
p5 <- ggplot(data = bankcat, mapping = aes(x = month)) + cat_bar_layers
p6 <- ggplot(data = bankcat, mapping = aes(x = poutcome)) + cat_bar_layers
# Draw every plot on its own first ...
p1
p2
p3
p4
p5
p6
# ... then all six together on a single page
grid.arrange(
  p1, p2, p3, p4, p5, p6,
  top = "Plot matrix of the categorical variables in the bank data set"
)
Now that the categorical and numerical variables have been visualized, it’s time to proceed to the binary data. In this section we will draw a simple barplot of the categories in each binary variable.
# Per-column summary of the binary variables
summary(object = bankbin)
## default housing loan y
## no :4445 no :1962 no :3830 no :4000
## yes: 76 yes:2559 yes: 691 yes: 521
# Bar plot of the two level counts for each binary variable, arranged on a
# single page. The bar layer is identical for all four plots.
bin_bar_layer <- list(geom_bar(fill = "orange", color = "black"))
p11 <- ggplot(data = bankbin, mapping = aes(x = default)) + bin_bar_layer
p12 <- ggplot(data = bankbin, mapping = aes(x = housing)) + bin_bar_layer
p13 <- ggplot(data = bankbin, mapping = aes(x = loan)) + bin_bar_layer
p14 <- ggplot(data = bankbin, mapping = aes(x = y)) + bin_bar_layer
grid.arrange(
  p11, p12, p13, p14,
  top = "Plot matrix of binary data of the bank dataset"
)
In this section we will try to explore the associations within each variable type, and then between the variable types.

### Categorical

In this subsection we will construct the contingency tables of the possible pairs of categorical variables, and then we will use these tables to conduct the chi2 test.
# Contingency tables for every pair of multinomial variables.
# Each table is built from the two named columns of bankcat, so nothing has
# to be attached to the search path. table() on a two-column data frame
# cross-tabulates the columns and labels the dimensions with their names.
t1 <- table(bankcat[c("job", "marital")])
t2 <- table(bankcat[c("job", "education")])
t3 <- table(bankcat[c("job", "contact")])
t4 <- table(bankcat[c("job", "month")])
t5 <- table(bankcat[c("job", "poutcome")])
t6 <- table(bankcat[c("marital", "education")])
t7 <- table(bankcat[c("marital", "contact")])
t8 <- table(bankcat[c("marital", "month")])
t9 <- table(bankcat[c("marital", "poutcome")])
t10 <- table(bankcat[c("education", "contact")])
t11 <- table(bankcat[c("education", "month")])
t12 <- table(bankcat[c("education", "poutcome")])
t13 <- table(bankcat[c("contact", "month")])
t14 <- table(bankcat[c("contact", "poutcome")])
t15 <- table(bankcat[c("month", "poutcome")])
tables <- list(
  t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15
)
# Pearson chi-squared test of independence on each table, in list order
map(tables, chisq.test)
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## Warning in .f(.x[[i]], ...): Chi-squared approximation may be incorrect
## [[1]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 373.18, df = 22, p-value < 2.2e-16
##
##
## [[2]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 2840, df = 33, p-value < 2.2e-16
##
##
## [[3]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 226.51, df = 22, p-value < 2.2e-16
##
##
## [[4]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 767.94, df = 121, p-value < 2.2e-16
##
##
## [[5]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 54.38, df = 33, p-value = 0.01097
##
##
## [[6]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 139.09, df = 6, p-value < 2.2e-16
##
##
## [[7]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 29.519, df = 4, p-value = 6.133e-06
##
##
## [[8]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 52.912, df = 22, p-value = 0.0002336
##
##
## [[9]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 6.9389, df = 6, p-value = 0.3265
##
##
## [[10]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 147.01, df = 6, p-value < 2.2e-16
##
##
## [[11]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 205.06, df = 33, p-value < 2.2e-16
##
##
## [[12]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 13.551, df = 9, p-value = 0.1392
##
##
## [[13]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 2401.1, df = 22, p-value < 2.2e-16
##
##
## [[14]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 377.92, df = 6, p-value < 2.2e-16
##
##
## [[15]]
##
## Pearson's Chi-squared test
##
## data: .x[[i]]
## X-squared = 693.97, df = 33, p-value < 2.2e-16
In this subsection we will explore the inter-binary association within the binary data. As usual, we construct the contingency tables, and then we conduct the chi2 test.
# Contingency tables for every pair of binary variables.
# Each table is built from the two named columns of bankbin, so nothing has
# to be attached to the search path. table() on a two-column data frame
# cross-tabulates the columns and labels the dimensions with their names.
t21 <- table(bankbin[c("default", "housing")])
t22 <- table(bankbin[c("default", "loan")])
t23 <- table(bankbin[c("default", "y")])
t24 <- table(bankbin[c("housing", "loan")])
t25 <- table(bankbin[c("housing", "y")])
t26 <- table(bankbin[c("loan", "y")])
tables1 <- list(t21, t22, t23, t24, t25, t26)
# Chi-squared test of independence on each 2 x 2 table, in list order
map(tables1, chisq.test)
## [[1]]
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .x[[i]]
## X-squared = 0.11967, df = 1, p-value = 0.7294
##
##
## [[2]]
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .x[[i]]
## X-squared = 17.157, df = 1, p-value = 3.441e-05
##
##
## [[3]]
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .x[[i]]
## X-squared = 1.1844e-27, df = 1, p-value = 1
##
##
## [[4]]
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .x[[i]]
## X-squared = 1.4374, df = 1, p-value = 0.2306
##
##
## [[5]]
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .x[[i]]
## X-squared = 48.885, df = 1, p-value = 2.715e-12
##
##
## [[6]]
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .x[[i]]
## X-squared = 21.872, df = 1, p-value = 2.915e-06
In this subsection we will explore the association between “y” and the numerical variables, and then do the same for the rest of the binary variables, mainly through boxplot visualizations using the ggplot2 package.
# Boxplots of the numerical variables grouped by the response `y`.
# Every plot is given `data = bank`, so ggplot2 looks the columns up in the
# data frame itself; attaching bank to the search path is not needed.
# Layers shared by all six plots: orange boxes drawn horizontally.
y_box_layers <- list(
  geom_boxplot(fill = "orange", color = "black"),
  coord_flip()
)
a1 <- ggplot(data = bank, aes(x = y, y = age)) + y_box_layers
a2 <- ggplot(data = bank, aes(x = y, y = balance)) + y_box_layers
a3 <- ggplot(data = bank, aes(x = y, y = day)) + y_box_layers
a4 <- ggplot(data = bank, aes(x = y, y = duration)) + y_box_layers
a5 <- ggplot(data = bank, aes(x = y, y = campaign)) + y_box_layers
a6 <- ggplot(data = bank, aes(x = y, y = pdays)) + y_box_layers
# Draw each plot on its own, then all six on a single page
a1
a2
a3
a4
a5
a6
grid.arrange(
  a1, a2, a3, a4, a5, a6,
  top = "Binary variables difference of groups regarding to 'y'"
)
“Housing” vs the rest of the numerical variables
# Boxplots of the numerical variables grouped by `housing`.
# Every plot is given `data = bank`, so ggplot2 looks the columns up in the
# data frame itself; attaching bank again would only add another copy of it
# to the search path.
# Layers shared by all six plots: orange boxes drawn horizontally.
housing_box_layers <- list(
  geom_boxplot(fill = "orange", color = "black"),
  coord_flip()
)
a11 <- ggplot(data = bank, aes(x = housing, y = age)) + housing_box_layers
a21 <- ggplot(data = bank, aes(x = housing, y = balance)) + housing_box_layers
a31 <- ggplot(data = bank, aes(x = housing, y = day)) + housing_box_layers
a41 <- ggplot(data = bank, aes(x = housing, y = duration)) +
  housing_box_layers
a51 <- ggplot(data = bank, aes(x = housing, y = campaign)) +
  housing_box_layers
a61 <- ggplot(data = bank, aes(x = housing, y = pdays)) + housing_box_layers
# Draw each plot on its own, then all six on a single page
a11
a21
a31
a41
a51
a61
grid.arrange(
  a11, a21, a31, a41, a51, a61,
  top = "Numerical data and their propreties comparing to housing"
)
“Loan” vs the rest of the numerical variables
# Boxplots of the numerical variables grouped by `loan`.
# Every plot is given `data = bank`, so ggplot2 looks the columns up in the
# data frame itself; attaching bank again would only add another copy of it
# to the search path.
# Layers shared by all six plots: orange boxes drawn horizontally.
loan_box_layers <- list(
  geom_boxplot(fill = "orange", color = "black"),
  coord_flip()
)
a12 <- ggplot(data = bank, aes(x = loan, y = age)) + loan_box_layers
a22 <- ggplot(data = bank, aes(x = loan, y = balance)) + loan_box_layers
a32 <- ggplot(data = bank, aes(x = loan, y = day)) + loan_box_layers
a42 <- ggplot(data = bank, aes(x = loan, y = duration)) + loan_box_layers
a52 <- ggplot(data = bank, aes(x = loan, y = campaign)) + loan_box_layers
a62 <- ggplot(data = bank, aes(x = loan, y = pdays)) + loan_box_layers
# Draw each plot on its own, then all six on a single page
a12
a22
a32
a42
a52
a62
grid.arrange(
  a12, a22, a32, a42, a52, a62,
  top = "numerical variables and their propreties comparing to 'loan'"
)
“default” vs the rest of the numerical variables
# Boxplots of the numerical variables grouped by `default`.
# Every plot is given `data = bank`, so ggplot2 looks the columns up in the
# data frame itself; attaching bank again would only add another copy of it
# to the search path.
# Layers shared by all six plots: orange boxes drawn horizontally.
default_box_layers <- list(
  geom_boxplot(fill = "orange", color = "black"),
  coord_flip()
)
a13 <- ggplot(data = bank, aes(x = default, y = age)) + default_box_layers
a23 <- ggplot(data = bank, aes(x = default, y = balance)) + default_box_layers
a33 <- ggplot(data = bank, aes(x = default, y = day)) + default_box_layers
a43 <- ggplot(data = bank, aes(x = default, y = duration)) +
  default_box_layers
a53 <- ggplot(data = bank, aes(x = default, y = campaign)) +
  default_box_layers
a63 <- ggplot(data = bank, aes(x = default, y = pdays)) + default_box_layers
# Draw each plot on its own, then all six on a single page
a13
a23
a33
a43
a53
a63
grid.arrange(
  a13, a23, a33, a43, a53, a63,
  top = "the variable default in comparision to numerical variables"
)
In this subsection we will explore each numerical variable across the levels of the categorical variables, while discriminating the observations according to the response in the variable “y”.
“age” vs the categorical variables
# Boxplots of `age`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Every plot is given `data = bank`, so ggplot2 looks the columns up in the
# data frame itself; attaching bank again would only add another copy of it
# to the search path.
# Base plot shared by all six: age on the value axis, boxes filled by `y`.
age_base <- ggplot(data = bank, aes(y = age)) +
  geom_boxplot(aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
age_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b11 <- age_base + facet_grid(job ~ .) + age_finish
b21 <- age_base + facet_grid(marital ~ .) + age_finish
b31 <- age_base + facet_grid(education ~ .) + age_finish
b41 <- age_base + facet_grid(contact ~ .) + age_finish
b51 <- age_base + facet_grid(month ~ .) + age_finish
b61 <- age_base + facet_grid(poutcome ~ .) + age_finish
b11
b21
b31
b41
b51
b61
“balance” vs the categorical variables
# Boxplots of `balance`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Base plot shared by all six: balance on the value axis, boxes filled by `y`.
balance_base <- ggplot(data = bank, mapping = aes(y = balance)) +
  geom_boxplot(mapping = aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
balance_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b12 <- balance_base + facet_grid(job ~ .) + balance_finish
b22 <- balance_base + facet_grid(marital ~ .) + balance_finish
b32 <- balance_base + facet_grid(education ~ .) + balance_finish
b42 <- balance_base + facet_grid(contact ~ .) + balance_finish
b52 <- balance_base + facet_grid(month ~ .) + balance_finish
b62 <- balance_base + facet_grid(poutcome ~ .) + balance_finish
b12
b22
b32
b42
b52
b62
“day” vs the categorical variables
# Boxplots of `day`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Base plot shared by all six: day on the value axis, boxes filled by `y`.
day_base <- ggplot(data = bank, mapping = aes(y = day)) +
  geom_boxplot(mapping = aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
day_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b13 <- day_base + facet_grid(job ~ .) + day_finish
b23 <- day_base + facet_grid(marital ~ .) + day_finish
b33 <- day_base + facet_grid(education ~ .) + day_finish
b43 <- day_base + facet_grid(contact ~ .) + day_finish
b53 <- day_base + facet_grid(month ~ .) + day_finish
b63 <- day_base + facet_grid(poutcome ~ .) + day_finish
b13
b23
b33
b43
b53
b63
“duration” vs the categorical variables
# Boxplots of `duration`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Base plot shared by all six: duration on the value axis, boxes filled by `y`.
duration_base <- ggplot(data = bank, mapping = aes(y = duration)) +
  geom_boxplot(mapping = aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
duration_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b14 <- duration_base + facet_grid(job ~ .) + duration_finish
b24 <- duration_base + facet_grid(marital ~ .) + duration_finish
b34 <- duration_base + facet_grid(education ~ .) + duration_finish
b44 <- duration_base + facet_grid(contact ~ .) + duration_finish
b54 <- duration_base + facet_grid(month ~ .) + duration_finish
b64 <- duration_base + facet_grid(poutcome ~ .) + duration_finish
b14
b24
b34
b44
b54
b64
“campaign” vs the categorical variables
# Boxplots of `campaign`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Base plot shared by all six: campaign on the value axis, boxes filled by `y`.
campaign_base <- ggplot(data = bank, mapping = aes(y = campaign)) +
  geom_boxplot(mapping = aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
campaign_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b15 <- campaign_base + facet_grid(job ~ .) + campaign_finish
b25 <- campaign_base + facet_grid(marital ~ .) + campaign_finish
b35 <- campaign_base + facet_grid(education ~ .) + campaign_finish
b45 <- campaign_base + facet_grid(contact ~ .) + campaign_finish
b55 <- campaign_base + facet_grid(month ~ .) + campaign_finish
b65 <- campaign_base + facet_grid(poutcome ~ .) + campaign_finish
b15
b25
b35
b45
b55
b65
“pdays” vs the categorical variables
# Boxplots of `pdays`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Base plot shared by all six: pdays on the value axis, boxes filled by `y`.
pdays_base <- ggplot(data = bank, mapping = aes(y = pdays)) +
  geom_boxplot(mapping = aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
pdays_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b16 <- pdays_base + facet_grid(job ~ .) + pdays_finish
b26 <- pdays_base + facet_grid(marital ~ .) + pdays_finish
b36 <- pdays_base + facet_grid(education ~ .) + pdays_finish
b46 <- pdays_base + facet_grid(contact ~ .) + pdays_finish
b56 <- pdays_base + facet_grid(month ~ .) + pdays_finish
b66 <- pdays_base + facet_grid(poutcome ~ .) + pdays_finish
b16
b26
b36
b46
b56
b66
“previous” vs the categorical variables
# Boxplots of `previous`, filled by the response `y`, with one facet row per
# level of each multinomial variable.
# Base plot shared by all six: previous on the value axis, boxes filled by `y`.
previous_base <- ggplot(data = bank, mapping = aes(y = previous)) +
  geom_boxplot(mapping = aes(fill = y))
# Finishing layers: horizontal boxes, facet strip labels kept horizontal.
previous_finish <- list(
  coord_flip(),
  theme(strip.text.y = element_text(angle = 360, hjust = 1))
)
b17 <- previous_base + facet_grid(job ~ .) + previous_finish
b27 <- previous_base + facet_grid(marital ~ .) + previous_finish
b37 <- previous_base + facet_grid(education ~ .) + previous_finish
b47 <- previous_base + facet_grid(contact ~ .) + previous_finish
b57 <- previous_base + facet_grid(month ~ .) + previous_finish
b67 <- previous_base + facet_grid(poutcome ~ .) + previous_finish
b17
b27
b37
b47
b57
b67